package org.hscoder.websearcher.dytt;

import com.fasterxml.jackson.core.type.TypeReference;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.hscoder.websearcher.domain.ConditionTags;
import org.hscoder.websearcher.domain.FilmData;
import org.hscoder.websearcher.domain.TagScore;
import org.hscoder.websearcher.util.JsonUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;

import javax.swing.text.html.HTML;
import java.io.File;
import java.io.IOException;
import java.util.*;

/**
 * Extracts film classification tags (sort, language, period, region) from locally
 * stored film data files so that they can be used as search conditions.
 */
public class DyttTagsFetcher {

    private static final Logger logger = LoggerFactory.getLogger(DyttTagsFetcher.class);


    /**
     * Loads the tags found in every data file directly under the given directory.
     *
     * <p>Each file is read as UTF-8 and parsed as a JSON list of {@link FilmData}.
     * Films without a name are ignored. For every remaining film, each non-empty
     * language, period, region and sort value increments the counter of the matching
     * tag. Files that cannot be read are logged and skipped.
     *
     * @param detailDir directory containing the film data files; must not be {@code null}
     * @return the collected tags, each list sorted by the natural ordering of
     *         {@link TagScore}; the lists are empty when the directory cannot be listed
     * @throws NullPointerException if {@code detailDir} is {@code null}
     */
    public static ConditionTags loadFromLocal(File detailDir) {
        Objects.requireNonNull(detailDir, "detailDir");

        Map<String, TagScore> sorts = new HashMap<>();
        Map<String, TagScore> languages = new HashMap<>();
        Map<String, TagScore> periods = new HashMap<>();
        Map<String, TagScore> regions = new HashMap<>();

        // File.listFiles() returns null (not an empty array) when the path is not a
        // directory or an I/O error occurs, so it must be checked before iterating.
        File[] dataFiles = detailDir.listFiles();
        if (dataFiles == null) {
            logger.warn("cannot list files of {}", detailDir.getAbsolutePath());
            dataFiles = new File[0];
        }

        // Walk through every data file
        for (File file : dataFiles) {
            // Sub-directories cannot be read as data files; skip them.
            if (!file.isFile()) {
                continue;
            }

            try {
                String pageContent = FileUtils.readFileToString(file, "utf-8");

                // Parse the file content into FilmData objects
                List<FilmData> pageFilms = JsonUtil.fromJson(pageContent, new TypeReference<List<FilmData>>() {
                });
                if (pageFilms == null) {
                    continue;
                }

                for (FilmData film : pageFilms) {
                    collectFilmTags(film, sorts, languages, periods, regions);
                }
            } catch (IOException e) {
                logger.error("read file {} failed", file.getAbsolutePath(), e);
            }
        }

        // Sort each result list (ordering is defined by TagScore's compareTo;
        // presumably descending by score -- confirm against TagScore)
        ConditionTags tags = new ConditionTags();
        tags.setLanguages(toSortedList(languages));
        tags.setPeriods(toSortedList(periods));
        tags.setRegions(toSortedList(regions));
        tags.setSorts(toSortedList(sorts));
        return tags;
    }

    /**
     * Records the tags of a single film into the given tag tables.
     * Null films and films without a name contribute nothing.
     */
    private static void collectFilmTags(FilmData film,
                                        Map<String, TagScore> sorts,
                                        Map<String, TagScore> languages,
                                        Map<String, TagScore> periods,
                                        Map<String, TagScore> regions) {
        if (film == null || StringUtils.isEmpty(film.getName())) {
            return;
        }

        if (!StringUtils.isEmpty(film.getLanguage())) {
            addTags(languages, film.getLanguage());
        }

        if (!StringUtils.isEmpty(film.getPeriod())) {
            addTags(periods, film.getPeriod());
        }

        if (!CollectionUtils.isEmpty(film.getRegions())) {
            addTags(regions, film.getRegions().toArray(new String[0]));
        }

        if (!CollectionUtils.isEmpty(film.getSorts())) {
            addTags(sorts, film.getSorts().toArray(new String[0]));
        }
    }

    /**
     * Copies the values of a tag table into a new list and sorts it by the
     * natural ordering of {@link TagScore}.
     */
    private static List<TagScore> toSortedList(Map<String, TagScore> tagMap) {
        List<TagScore> scores = new ArrayList<>(tagMap.values());
        Collections.sort(scores);
        return scores;
    }

    /**
     * Records the given tags in the tag table: a tag seen for the first time is
     * stored with a score of 1, a known tag has its score increased by 1.
     */
    private static void addTags(Map<String, TagScore> tagMap, String... tags) {

        for (String tag : tags) {
            TagScore tagScore = tagMap.get(tag);
            if (tagScore == null) {
                tagMap.put(tag, new TagScore(tag, 1));
            } else {
                // Already present: bump the score
                tagScore.setScore(tagScore.getScore() + 1);
            }
        }
    }

    /**
     * Loads the tags from a fixed local directory, prints them as pretty JSON and
     * writes the same JSON to a {@code dytt.tags} file next to that directory.
     *
     * @param args unused
     * @throws IOException if the tag file cannot be written
     */
    public static void main(String[] args) throws IOException {

        File detailDir = new File("D:/temp/dytt/detail");
        ConditionTags tags = loadFromLocal(detailDir);

        String jsonTags = JsonUtil.toPrettyJson(tags);
        System.out.println(jsonTags);

        // Save to file
        FileUtils.write(new File(detailDir.getParent(), "dytt.tags"), jsonTags, "UTF-8");
    }

}
